Passengers Data
Load data
dat <- import(here("data", "dat.csv")) %>%
filter(Category=="Passenger") %>%
clean_names() %>%
mutate_all(na_if,"") %>%
drop_na(survived, gender, class, age)
Clean data
dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$survived <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked <- as.factor(dat$disembarked)
dat <- dat %>%
mutate(nationality2 = case_when(nationality == "English" ~ "English",
nationality == "Irish" ~ "Irish",
nationality == "American" ~ "American",
nationality == "Swedish" ~ "Swedish",
nationality == "Finnish" ~ "Finnish",
nationality == "Scottish" ~ "Scottish",
nationality == "French" ~ "French",
nationality == "Italian" ~ "Italian",
nationality == "Canadian" ~ "Canadian",
nationality == "Bulgarian" ~ "Bulgarian",
nationality == "Croatian" ~ "Croatian",
nationality == "Belgian" ~ "Belgian",
nationality == "Norwegian" ~ "Norwegian",
nationality == "Channel Islander" ~ "Channel Islander",
nationality == "Welsh" ~ "Welsh",
nationality == "Swiss" ~ "Swiss",
nationality == "German" ~ "German",
nationality == "Danish" ~ "Danish",
nationality == "Spanish" ~ "Spanish",
nationality == "Australian" ~ "Australian",
nationality == "Polish" ~ "Polish",
nationality == "South African" ~ "South African",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Dutch" ~ "Dutch",
nationality == "Lithuanian" ~ "Lithuanian",
nationality == "Greek" ~ "Greek",
nationality == "Portuguese" ~ "Portuguese",
nationality == "Uruguayan" ~ "Uruguayan",
nationality == "Chinese" ~ "Chinese",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Cape Verdean" ~ "Cape Verdean",
nationality == "Egyptian" ~ "Egyptian",
nationality == "Japanese" ~ "Japanese",
nationality == "Hungarian" ~ "Hungarian",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Latvian" ~ "Latvian",
nationality == "Austrian" ~ "Austrian",
nationality == "Greek" ~ "Greek",
nationality == "Mexican" ~ "Mexican",
nationality == "Sweden" ~ "Sweedish",
nationality == "Turkish" ~ "Turkish",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Guyanese" ~ "Guyanese",
nationality == "Haitian" ~ "Haitian",
nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
nationality == "Unknown" ~ "Unknown",
TRUE ~ "Other - Multiple", ))
dat <- dat %>%
mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))
Descriptives
# Breakdown of passengers by class
dat %>%
group_by(class) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
adorn_totals() %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Breakdown of Passengers by Class and Gender
|
Class
|
Count
|
Percent
|
|
1st Class
|
324
|
24.64
|
|
2nd Class
|
284
|
21.60
|
|
3rd Class
|
707
|
53.76
|
|
Total
|
1315
|
100.00
|
# Breakdown of passengers by class and gender
dat %>%
group_by(class, gender) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Gender", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Breakdown of Passengers by Class and Gender
|
Class
|
Gender
|
Count
|
Percent
|
|
1st Class
|
Female
|
144
|
44.44
|
|
1st Class
|
Male
|
180
|
55.56
|
|
2nd Class
|
Female
|
106
|
37.32
|
|
2nd Class
|
Male
|
178
|
62.68
|
|
3rd Class
|
Female
|
216
|
30.55
|
|
3rd Class
|
Male
|
491
|
69.45
|
# Breakdown of passenger nationalities
dat %>%
filter(!is.na(nationality2)) %>%
group_by(nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities",
col.names = c("Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Breakdown of Passenger Nationalities
|
Nationality
|
Count
|
Percent
|
|
English
|
295
|
22.43
|
|
American
|
242
|
18.40
|
|
Irish
|
122
|
9.28
|
|
Other - Multiple
|
108
|
8.21
|
|
Swedish
|
99
|
7.53
|
|
Syrian/Lebanese
|
85
|
6.46
|
|
Finnish
|
58
|
4.41
|
|
Canadian
|
37
|
2.81
|
|
Bulgarian
|
31
|
2.36
|
|
Croatian
|
28
|
2.13
|
|
French
|
26
|
1.98
|
|
Norwegian
|
26
|
1.98
|
|
Belgian
|
25
|
1.90
|
|
Scottish
|
17
|
1.29
|
|
Channel Islander
|
15
|
1.14
|
|
Swiss
|
13
|
0.99
|
|
Danish
|
10
|
0.76
|
|
Italian
|
9
|
0.68
|
|
German
|
8
|
0.61
|
|
Spanish
|
8
|
0.61
|
|
Welsh
|
8
|
0.61
|
|
Polish
|
6
|
0.46
|
|
Bosnian
|
4
|
0.30
|
|
Hong Kongese
|
4
|
0.30
|
|
South African
|
4
|
0.30
|
|
Greek
|
3
|
0.23
|
|
Lithuanian
|
3
|
0.23
|
|
Uruguayan
|
3
|
0.23
|
|
Australian
|
2
|
0.15
|
|
Chinese
|
2
|
0.15
|
|
Portuguese
|
2
|
0.15
|
|
Slovenian
|
2
|
0.15
|
|
Austrian
|
1
|
0.08
|
|
Dutch
|
1
|
0.08
|
|
Egyptian
|
1
|
0.08
|
|
Haitian
|
1
|
0.08
|
|
Hungarian
|
1
|
0.08
|
|
Japanese
|
1
|
0.08
|
|
Latvian
|
1
|
0.08
|
|
Mexican
|
1
|
0.08
|
|
Sweedish
|
1
|
0.08
|
|
Turkish
|
1
|
0.08
|
# Breakdown of passenger nationalities by class
dat %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Breakdown of Passenger Nationalities by Class (All)
|
Class
|
Nationality
|
Count
|
Percent
|
|
1st Class
|
American
|
195
|
60.19
|
|
1st Class
|
English
|
38
|
11.73
|
|
1st Class
|
Canadian
|
27
|
8.33
|
|
1st Class
|
Other - Multiple
|
14
|
4.32
|
|
1st Class
|
French
|
10
|
3.09
|
|
1st Class
|
Swiss
|
6
|
1.85
|
|
1st Class
|
German
|
5
|
1.54
|
|
1st Class
|
Irish
|
5
|
1.54
|
|
1st Class
|
Spanish
|
4
|
1.23
|
|
1st Class
|
Swedish
|
4
|
1.23
|
|
1st Class
|
Scottish
|
3
|
0.93
|
|
1st Class
|
Uruguayan
|
3
|
0.93
|
|
1st Class
|
Belgian
|
2
|
0.62
|
|
1st Class
|
Italian
|
2
|
0.62
|
|
1st Class
|
Channel Islander
|
1
|
0.31
|
|
1st Class
|
Dutch
|
1
|
0.31
|
|
1st Class
|
Egyptian
|
1
|
0.31
|
|
1st Class
|
Mexican
|
1
|
0.31
|
|
1st Class
|
Norwegian
|
1
|
0.31
|
|
1st Class
|
Polish
|
1
|
0.31
|
|
2nd Class
|
English
|
145
|
51.06
|
|
2nd Class
|
Other - Multiple
|
25
|
8.80
|
|
2nd Class
|
American
|
24
|
8.45
|
|
2nd Class
|
Channel Islander
|
12
|
4.23
|
|
2nd Class
|
Irish
|
12
|
4.23
|
|
2nd Class
|
French
|
11
|
3.87
|
|
2nd Class
|
Scottish
|
8
|
2.82
|
|
2nd Class
|
Finnish
|
6
|
2.11
|
|
2nd Class
|
Swedish
|
6
|
2.11
|
|
2nd Class
|
Canadian
|
5
|
1.76
|
|
2nd Class
|
South African
|
4
|
1.41
|
|
2nd Class
|
Spanish
|
4
|
1.41
|
|
2nd Class
|
Danish
|
3
|
1.06
|
|
2nd Class
|
Italian
|
3
|
1.06
|
|
2nd Class
|
Lithuanian
|
2
|
0.70
|
|
2nd Class
|
Swiss
|
2
|
0.70
|
|
2nd Class
|
Syrian/Lebanese
|
2
|
0.70
|
|
2nd Class
|
Welsh
|
2
|
0.70
|
|
2nd Class
|
Australian
|
1
|
0.35
|
|
2nd Class
|
Belgian
|
1
|
0.35
|
|
2nd Class
|
German
|
1
|
0.35
|
|
2nd Class
|
Haitian
|
1
|
0.35
|
|
2nd Class
|
Hungarian
|
1
|
0.35
|
|
2nd Class
|
Japanese
|
1
|
0.35
|
|
2nd Class
|
Norwegian
|
1
|
0.35
|
|
2nd Class
|
Portuguese
|
1
|
0.35
|
|
3rd Class
|
English
|
112
|
15.84
|
|
3rd Class
|
Irish
|
105
|
14.85
|
|
3rd Class
|
Swedish
|
89
|
12.59
|
|
3rd Class
|
Syrian/Lebanese
|
83
|
11.74
|
|
3rd Class
|
Other - Multiple
|
69
|
9.76
|
|
3rd Class
|
Finnish
|
52
|
7.36
|
|
3rd Class
|
Bulgarian
|
31
|
4.38
|
|
3rd Class
|
Croatian
|
28
|
3.96
|
|
3rd Class
|
Norwegian
|
24
|
3.39
|
|
3rd Class
|
American
|
23
|
3.25
|
|
3rd Class
|
Belgian
|
22
|
3.11
|
|
3rd Class
|
Danish
|
7
|
0.99
|
|
3rd Class
|
Scottish
|
6
|
0.85
|
|
3rd Class
|
Welsh
|
6
|
0.85
|
|
3rd Class
|
Canadian
|
5
|
0.71
|
|
3rd Class
|
French
|
5
|
0.71
|
|
3rd Class
|
Polish
|
5
|
0.71
|
|
3rd Class
|
Swiss
|
5
|
0.71
|
|
3rd Class
|
Bosnian
|
4
|
0.57
|
|
3rd Class
|
Hong Kongese
|
4
|
0.57
|
|
3rd Class
|
Italian
|
4
|
0.57
|
|
3rd Class
|
Greek
|
3
|
0.42
|
|
3rd Class
|
Channel Islander
|
2
|
0.28
|
|
3rd Class
|
Chinese
|
2
|
0.28
|
|
3rd Class
|
German
|
2
|
0.28
|
|
3rd Class
|
Slovenian
|
2
|
0.28
|
|
3rd Class
|
Australian
|
1
|
0.14
|
|
3rd Class
|
Austrian
|
1
|
0.14
|
|
3rd Class
|
Latvian
|
1
|
0.14
|
|
3rd Class
|
Lithuanian
|
1
|
0.14
|
|
3rd Class
|
Portuguese
|
1
|
0.14
|
|
3rd Class
|
Sweedish
|
1
|
0.14
|
|
3rd Class
|
Turkish
|
1
|
0.14
|
# Average age by class
dat %>%
group_by(class) %>%
summarize(avg_age = mean(age), std_age = sd(age), min_age = min(age),
max_age = max(age)) %>%
kable(caption = "Average Age by Class",
col.names = c("Class", "Average Age", "SD Age", "Minimum Age", "Maximum Age"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Average Age by Class
|
Class
|
Average Age
|
SD Age
|
Minimum Age
|
Maximum Age
|
|
1st Class
|
39.14
|
13.55
|
0
|
71
|
|
2nd Class
|
30.01
|
13.90
|
0
|
71
|
|
3rd Class
|
25.12
|
11.71
|
0
|
74
|
Examining Survival
Survival rates
# Survival rate by class
dat %>%
group_by(class, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, survived) %>%
kable(caption = "Survival Rate by Class",
col.names = c("Class", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Survival Rate by Class
|
Class
|
Survived
|
Count
|
Percent
|
|
1st Class
|
Lost
|
123
|
37.96
|
|
1st Class
|
Saved
|
201
|
62.04
|
|
2nd Class
|
Lost
|
166
|
58.45
|
|
2nd Class
|
Saved
|
118
|
41.55
|
|
3rd Class
|
Lost
|
526
|
74.40
|
|
3rd Class
|
Saved
|
181
|
25.60
|
# Survival rate by gender
dat %>%
group_by(gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(gender, survived) %>%
kable(caption = "Survival Rate by Gender",
col.names = c("Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Survival Rate by Gender
|
Gender
|
Survived
|
Count
|
Percent
|
|
Female
|
Lost
|
127
|
27.25
|
|
Female
|
Saved
|
339
|
72.75
|
|
Male
|
Lost
|
688
|
81.04
|
|
Male
|
Saved
|
161
|
18.96
|
# Survival rate by class and gender
dat %>%
group_by(class, gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, gender) %>%
kable(caption = "Survival Rate by Class and Gender",
col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Survival Rate by Class and Gender
|
Class
|
Gender
|
Survived
|
Count
|
Percent
|
|
1st Class
|
Female
|
Lost
|
5
|
3.47
|
|
1st Class
|
Female
|
Saved
|
139
|
96.53
|
|
1st Class
|
Male
|
Lost
|
118
|
65.56
|
|
1st Class
|
Male
|
Saved
|
62
|
34.44
|
|
2nd Class
|
Female
|
Lost
|
12
|
11.32
|
|
2nd Class
|
Female
|
Saved
|
94
|
88.68
|
|
2nd Class
|
Male
|
Lost
|
154
|
86.52
|
|
2nd Class
|
Male
|
Saved
|
24
|
13.48
|
|
3rd Class
|
Female
|
Lost
|
110
|
50.93
|
|
3rd Class
|
Female
|
Saved
|
106
|
49.07
|
|
3rd Class
|
Male
|
Lost
|
416
|
84.73
|
|
3rd Class
|
Male
|
Saved
|
75
|
15.27
|
Density ridges
surv_classhist <- dat %>%
ggplot(aes(age, class)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class",
x = "Age Distribution", y = "Passenger Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

surv_agehist <- dat %>%
ggplot(aes(age, gender)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

surv_ageclass_hist <- dat %>%
ggplot(aes(age, gender)) +
facet_wrap(~class, nrow=3) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class and Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

Predicting survival
modeldat <- dat %>%
select(survived, gender, class, age)
ctree <- ctree(survived ~ gender + class + age, data=modeldat)
plot(ctree)

Tickets Prices
Load data
fares <- import(here("data", "avgfare.csv")) %>%
clean_names()
fares$accommodation <- as.factor(fares$accommodation)
fares$accommodation <- factor(fares$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))
Calculate inflation
p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)
p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)
p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)
p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)
p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)
p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)
p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)
p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)
p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)
p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)
p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)
Reshape data
fares_tidy <- fares %>%
pivot_longer(cols = starts_with("fare"),
names_to = "year",
names_prefix = "fare_",
values_to = "price", names_transform = list(year = as.integer))
Inflation-adjustment plot
fare_graph <- fares_tidy %>%
ggplot(aes(year, price, colour=accommodation)) +
geom_line() +
geom_point() +
scale_colour_brewer(palette="Spectral") +
facet_wrap(~ accommodation, 4, scales = "free") +
xlim(1912,2021) +
theme(panel.spacing = unit(1, "lines")) +
labs(y = "Price ($USD)", x = "Year", title = "Inflation-Adjusted Titanic Ticket Prices",
subtitle = "From 1912 to 2021", colour = "Accommodation") +
theme_minimal()
ggplotly(fare_graph)
When taking inflation rates into consideration, we see that the average price for a first class cabin in 1912 was $150.00, which today would be $4,241.74